CUSTOMER DELINQUENCY PREDICTION

#load libraries 
library(ggplot2)
library(earth)
library(Rprofet)
library(caret)
library(readr)
library(viridis)
library(gridExtra)
library(kableExtra)
library(patchwork)
library(ggeasy)
library(GGally)
library(ggsci)
library(ROCR)
library(tidyverse)

INTRODUCTION

The aim of this project is to fit a logistic and Multivariate adaptive regression splines (MARS) model using real accounts data from a credit card company in Sioux Falls, South Dakota for predicting if a customer is delinquent or not and determine which model performs best. The concept of binning will be applied in this project. Binning refers to dividing a list of continuous variables into groups (bins) to discover group patterns and impacts. For example, if you have data about a group of people, you might want to arrange their ages into a smaller number of age intervals. The MARS provide a convenient approach to capture the non-linear relationships in the data by assessing cutpoints (knots) similar to step functions. The procedure assesses each data point for each predictor as a knot and creates a linear regression model with the candidate features. Model comparison is done to compare the predictive power of the two models using the Receiver Operator Characteristics curves(ROC) and the Kolmogorov-Smirnov (KS) statistic.

DATA & PRE-PROCESSING

The dataset contains 6,237 observations and nineteen (19) variables. The dependent variable ‘Bad’ indicates whether the customer failed to pay their bill and is now seriously delinquent (in default) or not.

Below is a summary of the 19 variables in the dataset. This does not include rows with missing values. The analysis that follows makes use of rows with complete data. Rows with missing observations make up a small portion (5.12%) of the total dataset, therefore they are removed. Also, duplicate rows are removed from the dataset. The final dataset after the processing stage had 5,916 observations with 18 predictors for predicting customer delinquency.

# Load the raw accounts data
customerretentionMARS <- read_csv("customerretentionMARS.csv")

# Descriptive statistics (n, mean, sd, median, min, max), rounded to 2 dp
summary.stats <- customerretentionMARS %>%
  psych::describe(na.rm = F) %>%
  as.data.frame() %>%
  dplyr::select(n, mean, sd, median, min, max) %>%
  round(2)

# Render the summary as a formatted table
kbl(summary.stats, caption="Summary of customer Retention Data") %>%
  kable_classic(full_width = F, html_font = "Cambria", font_size = 12)
Summary of customer Retention Data
n mean sd median min max
DebtDimId 5916 15842974.14 4859359.63 17492421.00 38282.00 20985297.00
Months_On_Book 5916 32.18 27.73 22.00 6.00 164.00
Credit_Limit 5916 338.53 126.57 300.00 200.00 2500.00
Opening_Balance 5916 251.85 146.55 252.00 -320.00 1893.00
Ending_Balance 5916 241.60 147.01 242.00 -513.00 1969.00
Over_limit_Amount 5916 3.09 11.75 0.00 0.00 192.00
Actual_Min_Pay_Due 5916 21.87 5.62 25.00 0.00 60.00
Total_Min_Pay_Due 5916 24.96 13.43 25.00 0.00 212.00
Net_Payments_During_Cycle 5916 73.78 81.71 50.00 -117.00 985.00
Net_Purchases_During_Cycle 5916 47.32 80.25 14.00 -176.00 775.00
Net_Cash_Advances_During_Cycle 5916 1.13 12.92 0.00 0.00 411.00
Net_Premier_Fees_Billed_During_C 5916 12.34 14.93 7.00 0.00 159.00
Net_Behavior_Fees_Billed_During 5916 3.63 3.75 4.00 -27.00 63.00
Net_Concessions_Billed_During_Cy 5916 0.84 5.66 0.00 -28.00 100.00
Quarterly_Fico_Score 5916 588.46 86.69 598.00 0.00 810.00
Behavior_Score 5916 659.65 23.76 661.00 580.00 721.00
Good_Customer_Score 5916 769.78 98.44 738.00 0.00 1000.00
Utility 5916 0.72 0.33 0.84 -1.71 1.64
Bad 5916 0.12 0.32 0.00 0.00 1.00
# How many observations (rows) contain at least one missing value?
# FIX: the original sum(rowSums(is.na(...))) counts missing *cells*, not
# rows; complete.cases() identifies the affected rows directly.
missing <- sum(!complete.cases(customerretentionMARS)) # 321 observations out of 6237

# Remove rows with missing observations since there are only a few (321)
custRetention <- na.omit(customerretentionMARS)

# Remove duplicated accounts, keeping the first occurrence of each DebtDimId
# (logical subsetting with !duplicated() makes the which() wrapper unnecessary)
custRetention <- custRetention[!duplicated(custRetention$DebtDimId), ]

EXPLORATORY DATA ANALYSIS

# Global theme for all plots in the report
theme_set(theme_light(base_size = 10, base_family = "Arial Black"))

# -- Violin-dot plot: Months_On_Book by delinquency status -------------------

# Base aesthetic mapping and theme tweaks
g <- ggplot(custRetention,
            aes(x = as.factor(Bad), y = Months_On_Book, color = as.factor(Bad))) +
  scale_color_aaas() +
  labs(x = "Months on Books", y = NULL) +
  theme(legend.position = "none",
        axis.title = element_text(size = rel(0.8)),
        axis.text = element_text(size = 10, family = "Cochin"),
        panel.grid = element_blank())

# Group means of Months_On_Book, used for the dots, labels, and arrow targets
average_month1 <- mean(custRetention$Months_On_Book[custRetention$Bad == 1])
average_month2 <- mean(custRetention$Months_On_Book[custRetention$Bad == 0])

# Violin layer, mean dots, and a text label for each group average
gg <- g +
  geom_violin(size = 2, alpha = 0.25) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  annotate(geom = "text", y = 80, x = 0.73, size = 3, color = "brown",
           family = "Cochin",
           label = glue::glue("Average\n{round(average_month2,2)}")) +
  annotate(geom = "text", y = 80, x = 1.5, size = 3, color = "brown",
           family = "Cochin",
           label = glue::glue("Average\n{round(average_month1,2)}")) +
  labs(title = ("Violin-dot plot")) +
  theme(plot.title = element_text(hjust = 0.5))

# Start/end coordinates for the curved arrows pointing at the group means
arrows <- tibble(
  x_start = c(0.73, 1.5),
  x_end   = c(1, 2),
  y_start = c(73, 80),
  y_end   = c(average_month2, average_month1)
)

# Final panel: add the curved arrows (combined into the patchwork layout later)
ggbox <- gg +
  geom_curve(data = arrows,
             aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
             arrow = arrow(length = unit(0.07, "inch")), size = 0.4,
             color = "gray20", curvature = -0.4)

############################ 
# Histogram of Credit_Limit (x axis truncated at 1,250 for readability)
creditLim_dist <- ggplot(custRetention, aes(x = Credit_Limit)) +
  geom_histogram(color = "white", fill = "firebrick3", binwidth = 50) +
  scale_x_continuous(limits = c(0, 1250), expand = c(0.007, 0.005)) +
  # ggplot2 merges repeated labs()/theme() calls, so the original's
  # duplicated calls are collapsed into one of each here
  labs(x = "Credit Limit", y = "Frequency", title = "Histogram") +
  theme(legend.position = "none",
        axis.text = element_text(size = 10, family = "Cochin"),
        panel.grid = element_blank(),
        plot.title = element_text(hjust = 0.5))
 
 
 utility_dist <- ggplot(custRetention, aes(x = Utility)) + 
  geom_histogram(color = "white",fill= "firebrick3") + 
  labs(x = "Utility", 
       y = "Frequency") +
  scale_x_continuous(limits = c(-0.5, 2)) +
    theme(legend.position = "none", 
        axis.text = element_text(size=10, family ="Cochin"),
        panel.grid = element_blank()) +
   
   annotate(geom = "text", y=600, x=-0.2, size=4, color="brown", family="Cochin", label = glue::glue("negative values")) +
   annotate(geom = "text", y=610, x=1.35, size=4, color="brown", family="Cochin", label = glue::glue("values greater \nthan 100%"))
 
 hist_arror <- 
   tibble(
    x_start = c(-0.2, 1.2),
    x_end = c(-0.2, 1.2),
    y_start = c(550, 550),
    y_end = c(100, 200))
 
 utility_hist <- utility_dist + geom_curve(
    data = hist_arror, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
    arrow = arrow(length = unit(0.07, "inch")), size = 2,
    color = "gray20", curvature = 0
  )
 
################################
 
# -- Violin-dot plot: Behavior_Score by delinquency status -------------------

# Base aesthetic mapping and theme tweaks
p <- ggplot(custRetention,
            aes(x = as.factor(Bad), y = Behavior_Score, color = as.factor(Bad))) +
  scale_color_aaas() +
  labs(x = "Behavior Score", y = NULL) +
  theme(legend.position = "none",
        axis.title = element_text(size = rel(0.8)),
        axis.text = element_text(size = 10, family = "Cochin"),
        panel.grid = element_blank())

# Group means of Behavior_Score for the annotations and arrow targets
# (the original comment said "Month on Books"; this panel uses Behavior_Score)
average_behav1 <- mean(custRetention$Behavior_Score[custRetention$Bad == 1])
average_behav0 <- mean(custRetention$Behavior_Score[custRetention$Bad == 0])

# Violin layer, mean dots, and the two group-average labels
pp <- p +
  geom_violin(size = 2, alpha = 0.25) +
  stat_summary(fun = mean, geom = "point", size = 5) +
  annotate(geom = "text", y = 705, x = 1.5, size = 3, color = "brown",
           family = "Cochin",
           label = glue::glue("Average \n{round(average_behav0,2)}")) +
  annotate(geom = "text", y = 595, x = 1.5, size = 3, color = "brown",
           family = "Cochin",
           label = glue::glue("Average \n{round(average_behav1,2)}"))

# Curved arrows from each label to the corresponding group mean
arrows2 <- tibble(
  x_start = c(1.5, 1.5),
  x_end   = c(1, 2),
  y_start = c(700, 600),
  y_end   = c(average_behav0, average_behav1)
)

# Final panel with arrows (combined into the patchwork layout later)
ppbox <- pp +
  geom_curve(data = arrows2,
             aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
             arrow = arrow(length = unit(0.07, "inch")), size = 0.4,
             color = "gray20", curvature = -0.4)

# Assemble the four EDA panels into a 2x2 patchwork layout.
# FIX: renamed from `plot` to avoid shadowing base::plot(), which this
# script calls later (e.g. plot(importance)). The variable is displayed
# immediately, so ggsave() below still captures the correct figure.
eda_plots <- ((ggbox + creditLim_dist)) / ((ppbox + utility_hist))
eda_plots
Descriptive Graphs of selected variables

Descriptive Graphs of selected variables

# Save the EDA figure to disk.
# NOTE(review): no `plot =` argument is given, so ggsave() writes the most
# recently displayed plot — the patchwork figure printed just above.
ggsave(filename = "credt.png",
 width = 13, height = 8,
 dpi = 700)  

The above data analysis suggests that, on average, bad customers (delinquent customers) have lower months on books as well as lower behavior scores compared to good customers. Revolving utilization (utility) is one indicator of how much a customer owes on the account. This rate ranges between 0 and 1 (i.e., 0% and 100%). The above histogram shows customers with negative utility rates and utility rates above 1. This is unreasonable, therefore it is important to perform some feature engineering on the utility variable. Simply, negative values will be replaced with 0 and values greater than 1 will be replaced with 1.

# Clamp Utility into [0, 1]: negative values -> 0, values over 1 -> 1.
# pmin/pmax is the vectorized equivalent of the original nested ifelse().
custRetention <- as.data.frame(custRetention%>%mutate(Utility = pmin(pmax(Utility, 0), 1)))
# Split data into 60% training / 40% validation for variable selection
set.seed(222)
index <- createDataPartition( y=custRetention$Bad, p = 0.6, list = F)
train <- as.data.frame(custRetention[index, -1]%>%mutate(Bad=as.factor(Bad)))
# FIX: also drop DebtDimId (column 1) from the validation set; the original
# dropped it from `train` only, which was inconsistent. This object is
# rebuilt after binning (DATA PARTITIONING section) before it is ever used
# for prediction, so downstream results are unaffected.
custRet_validate <- as.data.frame(custRetention[-index, -1]%>%mutate(Bad=as.factor(Bad)))

VARIABLE SELECTION

Variable selection is performed to select a subset of relevant features for use in the model building process. Having irrelevant features in the data can decrease the accuracy of the models and make the model learn based on irrelevant features. Below is a plot of the predictors in order of importance in predicting delinquency.

# Variable-importance screen using a learning vector quantization model
set.seed(202111)

# 10-fold cross-validation training scheme
cv_scheme <- trainControl(method = "cv", number = 10)

# Fit the LVQ model on scaled predictors
model <- train(Bad ~ ., data = train, method = "lvq",
               preProcess = "scale", trControl = cv_scheme)

# Rank predictors by importance (unscaled scores)
importance <- varImp(model, scale = FALSE)
#print(importance)
importance_plot <- plot(importance)

importance_plot

I use the Recursive Feature Elimination (RFE) method for selection of variables. This is a widely used algorithm for selecting features that are most relevant in predicting the target variable in a predictive model. RFE applies a backward selection process to find the optimal combination of features. Based on the cross-validation accuracy, 10 attributes are selected. The 10 features selected are Behavior_Score, Good_Customer_Score, Quarterly_Fico_Score, Credit_Limit, Utility, Opening_Balance, Ending_Balance, Net_Purchases_During_Cycle, Net_Payments_During_Cycle, and Months_On_Book. I use these variables for both models after binning.

# Recursive feature elimination with a random-forest ranking function and
# 5-fold cross-validation
control <- rfeControl(functions=rfFuncs, method="cv", number=5)
# Run the RFE algorithm.
# FIX: select the predictor and target columns by name instead of the
# hard-coded positions train[, 1:17] / train[, 18], which silently break
# if the column order ever changes.
results <- rfe(train[, setdiff(names(train), "Bad")], train$Bad,
               sizes=c(1:10), rfeControl=control)
# Plot cross-validated accuracy against the number of retained features
var_select_plot <- plot(results, type=c("g", "o"))

var_select_plot

Selected Variables

predictors(results)
##  [1] "Behavior_Score"             "Good_Customer_Score"       
##  [3] "Quarterly_Fico_Score"       "Credit_Limit"              
##  [5] "Utility"                    "Opening_Balance"           
##  [7] "Ending_Balance"             "Net_Purchases_During_Cycle"
##  [9] "Net_Payments_During_Cycle"  "Months_On_Book"
# Keep only the RFE-selected predictors (plus account id and target)
selected_vars <- c("Behavior_Score", "Good_Customer_Score", "Quarterly_Fico_Score",
                   "Credit_Limit", "Utility", "Opening_Balance", "Ending_Balance",
                   "Net_Purchases_During_Cycle", "Net_Payments_During_Cycle",
                   "Months_On_Book")

# Version that retains the account identifier
data <- as.data.frame(custRetention %>%
  dplyr::select(DebtDimId, Bad, all_of(selected_vars)))

# Modeling dataset: target plus selected predictors only
custRetention <- as.data.frame(custRetention %>%
  dplyr::select(Bad, all_of(selected_vars)))

BINNING OF VARIABLES

Some continuous predictor variables used for building models are binned. Binning is a way to group a number of more or less continuous values into a smaller number of “bins”. Once the bins are created, the information gets compressed into groups, which later affects the final model. These continuous variables are now treated as factor/categorical variables. Below is the visualization of some binned continuous predictors.

# Bin the continuous predictors into factor variables.
# right = F makes each interval left-closed: [a, b).
custRetention <- custRetention%>%
  dplyr::mutate(Behavior_Score_Bins=cut(Behavior_Score, breaks=c(-Inf, 600, 670, Inf), right = F),
         Good_Customer_Score_Bins=cut(Good_Customer_Score, breaks=c(-Inf,700,750,820,Inf), right = F),
         # FIX: cut() requires increasing breaks; the original had
         # c(-Inf, 550, 650, 642, Inf), which is unsorted. The intended
         # bins [550,642), [642,650), [650,Inf) match the coefficient
         # tables in the MARS and logistic model summaries.
         Quarterly_Fico_Score_Bins=cut(Quarterly_Fico_Score, breaks=c(-Inf, 550, 642, 650, Inf), right = F),
         Utility_Bins=cut(Utility, breaks=c(-Inf, 0, 0.5, 1, Inf), right = F),
         Opening_Balance_Bins=cut(Opening_Balance, breaks=c(-Inf, 141, 228, 347, Inf), right = F),
         Ending_Balance_Bins = cut(Ending_Balance, breaks =c(-Inf, 131, 242,299, Inf), right = F))%>%
  dplyr::select(Bad, Behavior_Score_Bins, Good_Customer_Score_Bins, Quarterly_Fico_Score_Bins, Credit_Limit, Utility_Bins, Opening_Balance_Bins, Ending_Balance_Bins, Net_Purchases_During_Cycle, Net_Payments_During_Cycle, Months_On_Book)

# Weight-of-evidence plots of the binned predictors against the target

WOEplotter(dat = custRetention, var = 'Behavior_Score_Bins', target = 'Bad')

WOEplotter(dat = custRetention, var = 'Good_Customer_Score_Bins', target = 'Bad')

WOEplotter(dat = custRetention, var = 'Quarterly_Fico_Score_Bins', target = 'Bad')

WOEplotter(dat = custRetention, var = 'Utility_Bins', target = 'Bad')

#WOEplotter(dat = custRetention, var = 'Opening_Balance_Bucket', target = 'Bad')
#WOEplotter(dat = custRetention, var = 'Ending_Balance_Bucket', target = 'Bad')

custRetention <- as.data.frame(custRetention)

#custRetention

DATA PARTITIONING

Separating data into training and validation sets is an important part of evaluating the models. 60% of the data is used for training the models, and a 40% of the data is used for validation. The data is randomly sampled to help ensure that the training and validation sets are similar. By using similar data for training and validation, The effect of data discrepancies can be minimized and better understand the characteristics of the models.

After the models have been trained by using the training set, the models are tested by making predictions against the validation set. Because the data in the validation set already contains known values for the response variable,Bad, it is easy to determine whether the models’ guesses are correct or not.

# 60/40 train/validation split on the binned dataset
set.seed(222)
index <- createDataPartition( y=custRetention$Bad, p = 0.6, list = F)
custRet_train <- as.data.frame(custRetention[index, ]%>%mutate(Bad=as.factor(Bad)))
custRet_validate <- as.data.frame(custRetention[-index, ]%>%mutate(Bad=as.factor(Bad)))


# Save both partitions.
# FIX: row.names = FALSE stops write.csv() from prepending a spurious
# unnamed row-number column to the saved files.
write.csv(custRet_train, "custRet_train.csv", row.names = FALSE)
write.csv(custRet_validate, "custRet_validate.csv", row.names = FALSE)

MARS MODEL

A total of 15 out of 18 variables entered the model. However, the model pruned the predictors and retained only 6 of them for the prediction.

# Fit a first-degree (additive, no interactions) MARS model with a
# logistic link for the binary target
mars_model <- earth(Bad ~ .,
                    data = custRet_train,
                    glm = list(family = "binomial"),
                    degree = 1)
# (name kept as-is: `mars_sumary` is read again by the fit-statistics table)
mars_sumary <- summary(mars_model)

# Coefficient table for the retained basis functions
kable(mars_sumary$coefficients,
      caption = "Summary of MARS model on Training Dataset") %>%
  kable_classic(full_width = F, html_font = "Cambria", font_size = 12)
Summary of MARS model on Training Dataset
1
(Intercept) 0.4438201
Behavior_Score_Bins[670, Inf) -0.3091668
Behavior_Score_Bins[600,670) -0.2362094
h(375-Credit_Limit) 0.0004092
h(Net_Payments_During_Cycle-70) -0.0020205
h(21-Net_Purchases_During_Cycle) 0.0019510
Quarterly_Fico_Score_Bins[650, Inf) -0.0723766
Quarterly_Fico_Score_Bins[642,650) -0.0947742
Quarterly_Fico_Score_Bins[550,642) -0.0452870
h(Net_Payments_During_Cycle-33) 0.0018394
Good_Customer_Score_Bins[820, Inf) -0.1430765
Good_Customer_Score_Bins[750,820) -0.1356229
Good_Customer_Score_Bins[700,750) -0.1112092
# Fit statistics: residual sum of squares, R-squared, generalized
# cross-validation score, and GCV-based R-squared
fit_stats <- cbind(rss  = mars_sumary$rss,
                   rsq  = mars_sumary$rsq,
                   gcv  = mars_sumary$gcv,
                   grsq = mars_sumary$grsq)

kable(round(fit_stats, 3), caption = "Mars Model Summary") %>%
  kable_classic(full_width = F, html_font = "Cambria", font_size = 12)
Mars Model Summary
rss rsq gcv grsq
319.695 0.101 0.091 0.089

ROC for Mars Model

See “Interpretation of ROC Curve” section for interpretation

# Predicted probabilities on training and validation data
mars_prediction_train <- predict(mars_model, type = "response", newdata = custRet_train)
mars_prediction_validate <- predict(mars_model, type = "response", newdata = custRet_validate)

# ROC/AUC on the training dataset
my_predictions_marsT <- prediction(mars_prediction_train, custRet_train$Bad, label.ordering = NULL)
roc_perfT <- performance(my_predictions_marsT, measure = "tpr", x.measure = "fpr")

auc_perf_marsT <- performance(my_predictions_marsT, measure = "auc")
auc_train <- as.numeric(auc_perf_marsT@y.values)

# ROC/AUC on the validation dataset (roc_perf is reused by the KS chart)
my_predictions_mars <- prediction(mars_prediction_validate, custRet_validate$Bad, label.ordering = NULL)
roc_perf <- performance(my_predictions_mars, measure = "tpr", x.measure = "fpr")

auc_perf_mars <- performance(my_predictions_mars, measure = "auc")
auc_valid <- as.numeric(auc_perf_mars@y.values)

# FIX: the original cbind() of the training and validation ROC vectors
# silently recycles the shorter pair (the two curves have different numbers
# of cutoffs), distorting the plotted validation curve. Keep each curve in
# its own data frame instead.
roc_train_mars <- data.frame(x = roc_perfT@x.values[[1]],
                             y = roc_perfT@y.values[[1]])
roc_valid_mars <- data.frame(x = roc_perf@x.values[[1]],
                             y = roc_perf@y.values[[1]])

# ROC curves: training (red) vs validation (blue), 45-degree reference line
mars_roc <- ggplot() +
  geom_line(data = roc_train_mars, aes(x = x, y = y), size = 1, col = "firebrick") +
  geom_line(data = roc_valid_mars, aes(x = x, y = y), size = 1, col = "blue") +
  theme_minimal() +
  geom_abline(intercept = 0, linetype = "dashed", size = 1) +
  annotate(geom = "text", y = 1, x = 0.15, size = 6, color = "brown", family = "Cochin",
           label = glue::glue("Training AUC = {round(auc_train,2)}")) +
  annotate(geom = "text", y = 0.92, x = 0.15, size = 6, color = "brown", family = "Cochin",
           label = glue::glue("Validation AUC = {round(auc_valid,2)}")) +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  # FIX: corrected the "Traning" typo in the displayed title
  labs(title = ("ROC for MARS Model (Training vs Validation)")) +
  theme(plot.title = element_text(hjust = 0.5))
mars_roc
ROC & AUC of MARS Model on training and validation data

ROC & AUC of MARS Model on training and validation data

LOGISTIC MODEL

Logistic regression, also called a logit model, is used to model the dichotomous outcome of credit delinquency. In the logit model, the log odds of the outcome is modeled as a linear combination of the predictor variables.

# ---- Logistic regression model ----

# Binomial GLM (logit link) on all binned predictors
log_model <- glm(Bad ~ ., data = custRet_train, family = "binomial")

log_summary <- summary(log_model)


# Alternative specification retained for reference: refit using only the
# statistically significant predictors
#log_model <- glm(Bad ~ Behavior_Score_Bins + Good_Customer_Score_Bins +  #Quarterly_Fico_Score_Bins + Net_Purchases_During_Cycle + Months_On_Book , data = #custRet_train, family = "binomial")

# Coefficient table, rounded to 3 dp
kable(round(log_summary$coefficients, 3),
      caption = "Summary of Logistic Model on Training data") %>%
  kable_classic(full_width = F, html_font = "Cambria", font_size = 12)
Summary of Logistic Model on Training data
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.900 0.524 1.719 0.086
Behavior_Score_Bins[600,670) -0.960 0.364 -2.636 0.008
Behavior_Score_Bins[670, Inf) -2.245 0.422 -5.314 0.000
Good_Customer_Score_Bins[700,750) -0.678 0.140 -4.830 0.000
Good_Customer_Score_Bins[750,820) -0.976 0.202 -4.826 0.000
Good_Customer_Score_Bins[820, Inf) -1.434 0.287 -4.993 0.000
Quarterly_Fico_Score_Bins[550,642) -0.368 0.119 -3.090 0.002
Quarterly_Fico_Score_Bins[642,650) -1.841 0.596 -3.091 0.002
Quarterly_Fico_Score_Bins[650, Inf) -1.046 0.232 -4.513 0.000
Credit_Limit -0.001 0.001 -0.608 0.543
Utility_Bins[0.5,1) -0.095 0.483 -0.197 0.844
Utility_Bins[1, Inf) 0.078 0.517 0.151 0.880
Opening_Balance_Bins[141,228) -0.043 0.305 -0.141 0.888
Opening_Balance_Bins[228,347) -0.029 0.353 -0.083 0.934
Opening_Balance_Bins[347, Inf) -0.018 0.452 -0.040 0.968
Ending_Balance_Bins[131,242) -0.303 0.474 -0.640 0.522
Ending_Balance_Bins[242,299) -0.247 0.508 -0.487 0.626
Ending_Balance_Bins[299, Inf) -0.586 0.549 -1.067 0.286
Net_Purchases_During_Cycle -0.004 0.001 -2.458 0.014
Net_Payments_During_Cycle 0.002 0.001 1.133 0.257
Months_On_Book -0.005 0.003 -1.691 0.091

ROC for Logistic Model

See “Interpretation of ROC Curves” section for interpretation

# ---- Predictions and ROC/AUC for the logistic model ----


# Predicted probabilities on training and validation data
log_prediction_train <- predict(log_model, type = "response", newdata = custRet_train)
log_prediction_validate <- predict(log_model, type = "response", newdata = custRet_validate)

# ROC/AUC on the training dataset
my_predictions_logT <- prediction(log_prediction_train, custRet_train$Bad, label.ordering = NULL)
roc_perf_logT <- performance(my_predictions_logT, measure = "tpr", x.measure = "fpr")


auc_perf_logT <- performance(my_predictions_logT, measure = "auc")
log_auc_train <- as.numeric(auc_perf_logT@y.values)

# ROC/AUC on the validation dataset (roc_perf_log is reused by the KS chart)
my_predictions_log <- prediction(log_prediction_validate, custRet_validate$Bad, label.ordering = NULL)
roc_perf_log <- performance(my_predictions_log, measure = "tpr", x.measure = "fpr")


auc_perf_log <- performance(my_predictions_log, measure = "auc")
log_auc_valid <- as.numeric(auc_perf_log@y.values)


# FIX: the original cbind() of the training and validation ROC vectors
# silently recycles the shorter pair (the two curves have different numbers
# of cutoffs), distorting the plotted validation curve. Keep each curve in
# its own data frame instead.
roc_train_log <- data.frame(x = roc_perf_logT@x.values[[1]],
                            y = roc_perf_logT@y.values[[1]])
roc_valid_log <- data.frame(x = roc_perf_log@x.values[[1]],
                            y = roc_perf_log@y.values[[1]])

# ROC curves: training (red) vs validation (blue), 45-degree reference line
logit_roc <- ggplot() +
  geom_line(data = roc_train_log, aes(x = x, y = y), size = 1, col = "firebrick") +
  geom_line(data = roc_valid_log, aes(x = x, y = y), size = 1, col = "blue") +
  theme_minimal() +
  geom_abline(intercept = 0, linetype = "dashed", size = 1) +
  annotate(geom = "text", y = 1, x = 0.15, size = 6, color = "brown", family = "Cochin",
           label = glue::glue("Training AUC = {round(log_auc_train,2)}")) +
  annotate(geom = "text", y = 0.92, x = 0.15, size = 6, color = "brown", family = "Cochin",
           label = glue::glue("Validation AUC = {round(log_auc_valid,2)}")) +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  # FIX: corrected the "Traning" typo in the displayed title
  labs(title = ("ROC for Logistic Model (Training vs Validation)")) +
  theme(plot.title = element_text(hjust = 0.5))


logit_roc
ROC & AUC of Logistic Model on training and validation data

ROC & AUC of Logistic Model on training and validation data

# Save the logistic ROC figure.
# NOTE(review): no `plot =` argument is given, so ggsave() writes the most
# recently displayed plot — the ROC figure printed just above.
ggsave(filename = "roc.png",
 width = 15, height = 8,
 dpi = 700)

INTERPRETATION OF ROC CURVES

The ROC curve is created by evaluating the class probabilities for the model across a continuum of thresholds. For each candidate threshold, the resulting true-positive rate (sensitivity) and false-positive rate (1 − specificity) are plotted against each other. The figures above show the results of this process for the credit card data for two models: MARS and logistic. The ROC plot is a helpful tool for choosing the threshold that appropriately maximizes the trade-off between sensitivity and specificity. In comparing the two models with ROC curves, a perfect model would have a sensitivity and specificity of 100% — graphically, the curve would rise steeply from (0,0) to (0,1) and then remain constant from (0,1) to (1,1). The area under the curve (AUC) of such a perfect model would be equal to 1. An ineffective model will have an ROC curve that follows the 45-degree diagonal line and an AUC of approximately 0.5.

ROC curves with corresponding Area Under Curve (AUC) values are made from the training and validation datasets for each model.In comparing the logistic and the MARS model, ROC plots and AUC was generated from the validation dataset. It can be seen that the logistic model and the MARS model have the same AUC values therefore both models can be said to have the same predictive power in this case.

THE KOLMOGOROV-SMIRNOV (KS) CURVE & STATISTIC

The Kolmogorov-Smirnov (KS) statistic is a performance statistic which measures the discriminatory power of a model. It is the largest difference between the True Positive Rate (TPR) and False Positive Rate (FPR) at a given percentile. It looks at the maximum difference between the distribution of cumulative events and cumulative non-events. It is a very popular metric used in credit risk and response modeling. The Kolmogorov–Smirnov test is a very efficient way to determine if two samples are significantly different from each other. In predictive modeling, the statistic measures how well a model separates the two outcome classes. The higher the value, the better the model.

# ---- KS chart: logistic model (validation predictions) ----

# FPR/TPR pairs taken from the logistic validation ROC object
test <- data.frame(FPR = roc_perf_log@x.values[[1]],
                   TPR = roc_perf_log@y.values[[1]])
# Vectorized replacement of the original row-by-row loop (same values)
test$Percentile <- seq_len(nrow(test)) / nrow(test)
test$Difference <- abs(test$TPR - test$FPR)

# Row of maximum TPR-FPR separation: its Difference is the KS statistic D.
# which.max() returns a single row even if the maximum is tied.
max_diff <- test[which.max(test$Difference), ]

logit_ks <- ggplot(test) +
  geom_line(aes(x = Percentile, y = TPR), col = "firebrick", size = 1) +
  geom_line(aes(x = Percentile, y = FPR), col = "blue", size = 1) +
  geom_abline(intercept = 0, linetype = "dashed") +
  # FIX: the vertical marker belongs at the *percentile* where the maximum
  # separation occurs, not at the value of the separation itself (the
  # original used max_diff$Difference for both positions below).
  geom_vline(xintercept = max_diff$Percentile, linetype = "dashed") +
  labs(title = "KS Chart for Logit Predictions", y = "TPR/FPR") +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate(geom = "text", y = 0.92, x = max_diff$Percentile, size = 6, color = "brown",
           family = "Cochin", label = glue::glue("D = {round(max_diff$Difference,2)}"))


# ---- KS chart: MARS model (validation predictions) ----

# FPR/TPR pairs taken from the MARS validation ROC object
test <- data.frame(FPR = roc_perf@x.values[[1]],
                   TPR = roc_perf@y.values[[1]])
# Vectorized replacement of the original row-by-row loop (same values)
test$Percentile <- seq_len(nrow(test)) / nrow(test)
test$Difference <- abs(test$TPR - test$FPR)

# Row of maximum TPR-FPR separation: its Difference is the KS statistic D.
# which.max() returns a single row even if the maximum is tied.
max_diff <- test[which.max(test$Difference), ]

mars_ks <- ggplot(test) +
  geom_line(aes(x = Percentile, y = TPR), col = "firebrick", size = 1) +
  geom_line(aes(x = Percentile, y = FPR), col = "blue", size = 1) +
  geom_abline(intercept = 0, linetype = "dashed") +
  # FIX: mark the percentile at which the maximum separation occurs,
  # not the value of the separation itself (the original used
  # max_diff$Difference for both positions below).
  geom_vline(xintercept = max_diff$Percentile, linetype = "dashed") +
  labs(title = "KS Chart for MARS Predictions", y = "TPR/FPR") +
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate(geom = "text", y = 0.92, x = max_diff$Percentile, size = 6, color = "brown",
           family = "Cochin", label = glue::glue("D = {round(max_diff$Difference,2)}"))


# Display both KS charts side by side, then save the figure
((mars_ks + logit_ks))

ggsave(filename = "ks.png",
 width = 15, height = 8,
 dpi = 700)

In this case, the Logistic Model has the larger KS statistic. This means the logistic model achieves a greater maximum separation between the cumulative distributions of delinquent and non-delinquent customers than the MARS model does. By this criterion, the logistic model provides better discriminatory power.

CONCLUSION

The MARS and Logistic models are two good models in predicting customer delinquency. Even though the AUC of both models on the validation dataset are equal, based on the KS statistic, the logistic model outperforms the MARS model since it has the highest value.